This is a documentation for analyses of scRNA-seq data, generated from rat metrial gland tissues on gestational day (GD) 15.5 and 19.5.
We start out by building necessary objects and functions, including the gene list from Ensembl and preprocessing function.
First, build gene and biotype list. Since this object is reused frequently in downstream analysis, I will save this as an R object.

library(dplyr, suppressMessages())
library(Seurat, suppressMessages())
library(patchwork, suppressMessages())
library(ggplot2, suppressMessages())

#1. Build 
mart <- biomaRt::useMart("ensembl", host="http://sep2019.archive.ensembl.org", dataset = "rnorvegicus_gene_ensembl") #to make sure that we use the correct version of Ensembl
rnor6 <- biomaRt::getBM(attributes = c("ensembl_gene_id", "external_gene_name", "chromosome_name", "start_position", "end_position", "strand", "gene_biotype"), mart = mart) #build attribute table
mt.genes <- subset(rnor6, rnor6$chromosome_name == "MT")$external_gene_name
save(rnor6, file = "/work/LAS/geetu-lab/hhvu/project3_scATAC/rnor6.rna")

Next, build a preprocessing function:

preprocess <- function(dat=rats.rna) {
  #Initialize the Seurat object with the raw (non-normalized data)
  rats.rna <- CreateSeuratObject(counts = dat, project = "RNA", min.cells = 1)
  print("Information of the object before filtering:")
  print(rats.rna)
  
  #Check for chrMT genes
  existingMTgenes <- intersect(rownames(rats.rna), mt.genes)
  rats.rna[["percent.mt"]] <- PercentageFeatureSet(rats.rna, features = existingMTgenes)

  print("Visualize QC metrics as a violin plot:")
  p1 <- VlnPlot(rats.rna, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3) + ggtitle(paste0("GD", time, "_", i))
  print(p1)
  
  print("Visualize QC metrics as a scatter plot:")
  p2 <- FeatureScatter(rats.rna, feature1 = "nCount_RNA", feature2 = "nFeature_RNA") + ggtitle(paste0("GD", time, "_", i))
  p3 <- FeatureScatter(rats.rna, feature1 = "nFeature_RNA", feature2 = "percent.mt") + ggtitle(paste0("GD", time, "_", i))
  print(p2+p3)
  
  print("nFeature_RNA percentile")
  print(quantile(rats.rna[["nFeature_RNA"]][, 1], probs = seq(0, 1, 0.25)))

  print("nCount_RNA percentile")
  print(quantile(rats.rna[["nCount_RNA"]][, 1], probs = seq(0, 1, 0.25)))

  print("percent.mt percentile")
  print(quantile(rats.rna[["percent.mt"]][, 1], probs = seq(0, 1, 0.25)))
}
dir <- "/work/LAS/geetu-lab/hhvu/project3_scATAC/data/scRNA-seq/"
for (time in c("15.5", "19.5")) {
  if (time == "15.5") {
    for (i in 1:3) {
      dir1 <- paste0(dir, "GD", time, "-RNA/RNA-", time, "-", i)
      rna.data <- Read10X(data.dir = dir1)
      preprocess(rna.data)
    }
  } else {
    for (i in 4:7) {
      dir1 <- paste0(dir, "GD", time, "-RNA/RNA-", time, "-", i)
      rna.data <- Read10X(data.dir = dir1)
      preprocess(rna.data)
    }
  }
}
#> Warning: Feature names cannot have underscores ('_'), replacing with dashes
#> ('-')
#> [1] "Information of the object before filtering:"
#> An object of class Seurat 
#> 17769 features across 29487 samples within 1 assay 
#> Active assay: RNA (17769 features, 0 variable features)
#> [1] "Visualize QC metrics as a violin plot:"

#> [1] "Visualize QC metrics as a scatter plot:"
#> [1] "nFeature_RNA percentile"
#>   0%  25%  50%  75% 100% 
#>   47  694  989 1249 3278 
#> [1] "nCount_RNA percentile"
#>    0%   25%   50%   75%  100% 
#>   500  1412  2384  3370 13879 
#> [1] "percent.mt percentile"
#>        0%       25%       50%       75%      100% 
#>  0.440044  5.140187  6.467977  8.437225 84.551148
#> Warning: Feature names cannot have underscores ('_'), replacing with dashes
#> ('-')

#> [1] "Information of the object before filtering:"
#> An object of class Seurat 
#> 17663 features across 30757 samples within 1 assay 
#> Active assay: RNA (17663 features, 0 variable features)
#> [1] "Visualize QC metrics as a violin plot:"

#> [1] "Visualize QC metrics as a scatter plot:"
#> [1] "nFeature_RNA percentile"
#>   0%  25%  50%  75% 100% 
#>   80  602  882 1111 3197 
#> [1] "nCount_RNA percentile"
#>    0%   25%   50%   75%  100% 
#>   500  1183  2074  2908 20919 
#> [1] "percent.mt percentile"
#>        0%       25%       50%       75%      100% 
#>  1.238095  6.740443  8.611482 11.603651 83.697047
#> Warning: Feature names cannot have underscores ('_'), replacing with dashes
#> ('-')

#> [1] "Information of the object before filtering:"
#> An object of class Seurat 
#> 17649 features across 18338 samples within 1 assay 
#> Active assay: RNA (17649 features, 0 variable features)
#> [1] "Visualize QC metrics as a violin plot:"

#> [1] "Visualize QC metrics as a scatter plot:"
#> [1] "nFeature_RNA percentile"
#>      0%     25%     50%     75%    100% 
#>   51.00  685.25 1136.00 1505.00 3486.00 
#> [1] "nCount_RNA percentile"
#>       0%      25%      50%      75%     100% 
#>   500.00  1376.00  2934.00  4664.75 17620.00 
#> [1] "percent.mt percentile"
#>         0%        25%        50%        75%       100% 
#>  0.1926782  4.4835492  5.8217085  8.0157232 92.3540036
#> Warning: Feature names cannot have underscores ('_'), replacing with dashes
#> ('-')

#> [1] "Information of the object before filtering:"
#> An object of class Seurat 
#> 18547 features across 15401 samples within 1 assay 
#> Active assay: RNA (18547 features, 0 variable features)
#> [1] "Visualize QC metrics as a violin plot:"

#> [1] "Visualize QC metrics as a scatter plot:"
#> [1] "nFeature_RNA percentile"
#>   0%  25%  50%  75% 100% 
#>  127  693 1223 1908 4615 
#> [1] "nCount_RNA percentile"
#>    0%   25%   50%   75%  100% 
#>   500  1605  3451  6547 31046 
#> [1] "percent.mt percentile"
#>         0%        25%        50%        75%       100% 
#>  0.1558846  9.0245776 14.3666073 26.5591029 85.6913828
#> Warning: Feature names cannot have underscores ('_'), replacing with dashes
#> ('-')

#> [1] "Information of the object before filtering:"
#> An object of class Seurat 
#> 18283 features across 18388 samples within 1 assay 
#> Active assay: RNA (18283 features, 0 variable features)
#> [1] "Visualize QC metrics as a violin plot:"

#> [1] "Visualize QC metrics as a scatter plot:"
#> [1] "nFeature_RNA percentile"
#>   0%  25%  50%  75% 100% 
#>  113  605 1019 1621 4772 
#> [1] "nCount_RNA percentile"
#>    0%   25%   50%   75%  100% 
#>   576  1390  2691  5346 27989 
#> [1] "percent.mt percentile"
#>         0%        25%        50%        75%       100% 
#>  0.3072197  9.4161959 14.3602042 23.5977471 81.9969743
#> Warning: Feature names cannot have underscores ('_'), replacing with dashes
#> ('-')

#> [1] "Information of the object before filtering:"
#> An object of class Seurat 
#> 18606 features across 15444 samples within 1 assay 
#> Active assay: RNA (18606 features, 0 variable features)
#> [1] "Visualize QC metrics as a violin plot:"

#> [1] "Visualize QC metrics as a scatter plot:"
#> [1] "nFeature_RNA percentile"
#>   0%  25%  50%  75% 100% 
#>  102  637 1135 1868 5736 
#> [1] "nCount_RNA percentile"
#>       0%      25%      50%      75%     100% 
#>   556.00  1538.75  3204.00  6461.25 42688.00 
#> [1] "percent.mt percentile"
#>         0%        25%        50%        75%       100% 
#>  0.3415088  8.7186056 14.3532068 26.3381725 87.2650091
#> Warning: Feature names cannot have underscores ('_'), replacing with dashes
#> ('-')

#> [1] "Information of the object before filtering:"
#> An object of class Seurat 
#> 17727 features across 10391 samples within 1 assay 
#> Active assay: RNA (17727 features, 0 variable features)
#> [1] "Visualize QC metrics as a violin plot:"

#> [1] "Visualize QC metrics as a scatter plot:"

#> [1] "nFeature_RNA percentile"
#>     0%    25%    50%    75%   100% 
#>   36.0  304.5 1407.0 2162.0 5261.0 
#> [1] "nCount_RNA percentile"
#>    0%   25%   50%   75%  100% 
#>   500  2991  4734  8571 47645 
#> [1] "percent.mt percentile"
#>         0%        25%        50%        75%       100% 
#>  0.2218388 10.9538249 16.7805443 69.7823748 98.9568241

Based on the preprocessing metrics and plots, we keep cells that have \(500 <\) number of genes \(< 3500\) and have \(\%\) MT genes \(< 20\%\). I will also save the filtered objects for the next steps.

dir <- "/work/LAS/geetu-lab/hhvu/project3_scATAC/"
for (time in c("15.5", "19.5")) {
  if (time == "15.5") {
    for (i in 1:3) {
      dir1 <- paste0(dir, "data/scRNA-seq/GD", time, "-RNA/RNA-", time, "-", i)
      rna.data <- Read10X(data.dir = dir1)
      rats.rna <- CreateSeuratObject(counts = rna.data, project = "RNA", min.cells = 1)
    #Check for chrMT genes
      existingMTgenes <- intersect(rownames(rats.rna), mt.genes)
      rats.rna[["percent.mt"]] <- PercentageFeatureSet(rats.rna, features = existingMTgenes)
      rats.rna <- subset(rats.rna, subset = nFeature_RNA > 500 & nFeature_RNA < 3500 & percent.mt < 20)
      print(rats.rna)
      #save(rats.rna, file = paste0(dir, "scRNA-seq-analysis/1_preprocess/GD", time, "_", i, "-filtered.rda"))
    }
  } else {
    for (i in 4:7) {
      dir1 <- paste0(dir, "data/scRNA-seq/GD", time, "-RNA/RNA-", time, "-", i)
      rna.data <- Read10X(data.dir = dir1)
      rats.rna <- CreateSeuratObject(counts = rna.data, project = "RNA", min.cells = 1)
      #Check for chrMT genes
      existingMTgenes <- intersect(rownames(rats.rna), mt.genes)
      rats.rna[["percent.mt"]] <- PercentageFeatureSet(rats.rna, features = existingMTgenes)
      rats.rna <- subset(rats.rna, subset = nFeature_RNA > 500 & nFeature_RNA < 3500 & percent.mt < 20)
      print(rats.rna)
      #save(rats.rna, file = paste0(dir, "scRNA-seq-analysis/1_preprocess/GD", time, "_", i, "-filtered.rda"))
    }
  }
}
#> Warning: Feature names cannot have underscores ('_'), replacing with dashes
#> ('-')
#> An object of class Seurat 
#> 17769 features across 25384 samples within 1 assay 
#> Active assay: RNA (17769 features, 0 variable features)
#> Warning: Feature names cannot have underscores ('_'), replacing with dashes
#> ('-')
#> An object of class Seurat 
#> 17663 features across 24845 samples within 1 assay 
#> Active assay: RNA (17663 features, 0 variable features)
#> Warning: Feature names cannot have underscores ('_'), replacing with dashes
#> ('-')
#> An object of class Seurat 
#> 17649 features across 15613 samples within 1 assay 
#> Active assay: RNA (17649 features, 0 variable features)
#> Warning: Feature names cannot have underscores ('_'), replacing with dashes
#> ('-')
#> An object of class Seurat 
#> 18547 features across 8664 samples within 1 assay 
#> Active assay: RNA (18547 features, 0 variable features)
#> Warning: Feature names cannot have underscores ('_'), replacing with dashes
#> ('-')
#> An object of class Seurat 
#> 18283 features across 10641 samples within 1 assay 
#> Active assay: RNA (18283 features, 0 variable features)
#> Warning: Feature names cannot have underscores ('_'), replacing with dashes
#> ('-')
#> An object of class Seurat 
#> 18606 features across 8750 samples within 1 assay 
#> Active assay: RNA (18606 features, 0 variable features)
#> Warning: Feature names cannot have underscores ('_'), replacing with dashes
#> ('-')
#> An object of class Seurat 
#> 17727 features across 5562 samples within 1 assay 
#> Active assay: RNA (17727 features, 0 variable features)

In conclusion, we have the following metrics before and after preprocessing.

sum <- read.table("/work/LAS/geetu-lab/hhvu/project3_scATAC/scRNA-seq-analysis/MetrialGland-scRNA-seq/Files/0_prepocess/0_preprocessingSum.txt", header = T, sep = "\t")
library(knitr)
kable(sum, caption = "Preprocessing summary")
Preprocessing summary
Sample Before.filtering After.filtering
15.5_1 17769 genes across 29487 cells 17769 genes across 25384 cells
15.5_2 17663 genes across 30757 cells 17663 genes across 24845 cells
15.5_3 17649 genes across 18338 cells 17649 genes across 15613 cells
19.5_4 18547 genes across 15401 cells 18547 genes across 8664 cells
19.5_5 18283 genes across 18388 cells 18283 genes across 10641 cells
19.5_6 18606 genes across 15444 cells 18606 genes across 8750 cells
19.5_7 17727 genes across 10391 cells 17727 genes across 5562 cells
sessionInfo()
#> R version 4.0.2 (2020-06-22)
#> Platform: x86_64-pc-linux-gnu (64-bit)
#> Running under: Ubuntu 20.04.4 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
#> LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] knitr_1.39         ggplot2_3.3.4      patchwork_1.0.1    SeuratObject_4.0.4
#> [5] Seurat_4.1.0       dplyr_1.0.7       
#> 
#> loaded via a namespace (and not attached):
#>  [1] BiocFileCache_1.12.1  plyr_1.8.6            igraph_1.2.6         
#>  [4] lazyeval_0.2.2        splines_4.0.2         listenv_0.8.0        
#>  [7] scattermore_0.7       digest_0.6.29         htmltools_0.5.2      
#> [10] fansi_1.0.3           magrittr_2.0.3        memoise_1.1.0        
#> [13] tensor_1.5            cluster_2.1.0         ROCR_1.0-11          
#> [16] globals_0.13.0        matrixStats_0.57.0    askpass_1.1          
#> [19] spatstat.sparse_2.0-0 prettyunits_1.1.1     colorspace_2.0-3     
#> [22] blob_1.2.3            rappdirs_0.3.3        ggrepel_0.8.2        
#> [25] xfun_0.30             crayon_1.5.1          jsonlite_1.8.0       
#> [28] spatstat.data_2.2-0   survival_3.2-7        zoo_1.8-8            
#> [31] glue_1.6.2            polyclip_1.10-0       gtable_0.3.0         
#> [34] leiden_0.3.3          future.apply_1.6.0    BiocGenerics_0.34.0  
#> [37] abind_1.4-5           scales_1.2.0          DBI_1.1.2            
#> [40] miniUI_0.1.1.1        Rcpp_1.0.8            viridisLite_0.4.0    
#> [43] xtable_1.8-4          progress_1.2.2        reticulate_1.16      
#> [46] spatstat.core_2.3-1   bit_4.0.4             stats4_4.0.2         
#> [49] htmlwidgets_1.5.2     httr_1.4.2            RColorBrewer_1.1-3   
#> [52] ellipsis_0.3.2        ica_1.0-2             farver_2.1.0         
#> [55] pkgconfig_2.0.3       XML_3.99-0.5          sass_0.4.1           
#> [58] uwot_0.1.10           dbplyr_2.1.1          deldir_1.0-6         
#> [61] utf8_1.2.2            labeling_0.4.2        tidyselect_1.1.2     
#> [64] rlang_1.0.2           reshape2_1.4.4        later_1.1.0.1        
#> [67] AnnotationDbi_1.50.3  munsell_0.5.0         tools_4.0.2          
#> [70] cli_3.3.0             generics_0.1.2        RSQLite_2.2.1        
#> [73] ggridges_0.5.2        evaluate_0.15         stringr_1.4.0        
#>  [ reached getOption("max.print") -- omitted 58 entries ]